# Run settings as flat "key = value" pairs; this file has no [section] header.
# NOTE(review): the consuming parser is not visible from here -- confirm that it
# accepts '#' full-line comments and keys that sit outside any section.
# Key meanings below are inferred from the key names only; TODO confirm each
# against the code that reads this file.

# Presumably the sequence length (in tokens?) -- confirm unit with the reader.
seqlength = 2048
# Presumably the batch size; per-device vs. global is not stated here -- confirm.
batchsize = 4
# Presumably the number of gradient-accumulation steps -- confirm.
accumulate_steps = 9
# Presumably the training-token budget (1e8) -- confirm.
train_tokens = 100000000
# -1 looks like a sentinel (e.g. "unset" / "determine automatically") rather
# than a real value -- confirm how the reader treats negative values.
theoryflops = -1
# Presumably the number of passes over the training data -- confirm.
epochs = 1
# Capitalized boolean spelling; confirm the reader interprets "True" as a
# boolean rather than as the literal string.
flashattn = True
# Presumably the tensor-parallel and pipeline-parallel degrees -- confirm.
tensor_parallel = 2
pipeline_parallel = 2
